** Session setup: empty memory, close any open log, fix the seed, disable paging
clear all
capture log close
set seed 1234
set more off, permanently

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Table O.48: Blinder-Oaxaca Decomposition: Log (Annual Wages) between Seven and 12 Years After Admission Exam //////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

** Load the Phase 2 long-format dataset
use "Work Data/Gender_Phase2_long.dta", clear

*** Subject identifiers
* sub is a numeric, value-labelled copy of the string variable subject.
* tabulate creates one indicator per value of subject (d_sub1, d_sub2, and so
* on), numbered in the sorted order of those values.
encode subject, generate(sub)
tabulate subject, generate(d_sub)
label variable sub "Subject"

** Subject dummies
* NOTE(review): the mapping below assumes subject takes exactly eight values
* whose sorted order is Biology, Chemistry, Geography, History, Language,
* Mathematics, Physics, Portuguese. Confirm against the data.
rename d_sub1 Biology
rename d_sub2 Chemistry
rename d_sub3 Geography
rename d_sub4 History
rename d_sub5 Language
rename d_sub6 Mathematics
rename d_sub7 Physics
rename d_sub8 Portuguese

* Labels
* Every variable is referred to by its full name so that labelling does not
* depend on variable-name abbreviation being enabled.
label variable Biology "Biology"
label variable Chemistry "Chemistry"
label variable Geography "Geography"
label variable History "History"
label variable Mathematics "Mathematics"
label variable Physics "Physics"
label variable Portuguese "Portuguese"
label variable Language "Foreign Language"

** Female x priority interaction
generate fem_priority = female * priority
label variable fem_priority "Female $\times$ Priority"

** For every subject dummy: female x subject, priority x subject and
** female x priority x subject
foreach subj of varlist Biology-Portuguese {
    generate fem_`subj' = `subj' * female
    label variable fem_`subj' "Female $\times$ `subj'"

    generate prio_`subj' = priority * `subj'
    label variable prio_`subj' "Priority $\times$ `subj'"

    generate fem_prio_`subj' = fem_priority * `subj'
    label variable fem_prio_`subj' "Female $\times$ Priority $\times$ `subj'"
}

* Regressor lists used later: five subject dummies and their female interactions
* (Biology, Language and Portuguese are not part of either list)
global subject "Chemistry Geography History Mathematics Physics"
global subject_fem "fem_Chemistry fem_Geography fem_History fem_Mathematics fem_Physics"

** Phase 1 normalized score: second to fourth powers
forvalues p = 2/4 {
    generate norm_p1score`p' = norm_p1score^`p'
    summarize norm_p1score`p'
}

*********************************************************************************
****************   Relative performances ****************************************
*********************************************************************************

******************************** ENEM ********************************

* ENEM score relative to the mean of the same year and gender
bysort year female: egen norm_enem_w_ave_g = mean(norm_enem_w)
generate norm_enem_w_g = norm_enem_w - norm_enem_w_ave_g
bysort year female: summarize norm_enem_w_g
drop norm_enem_w_ave_g

* Priority x relative performance in ENEM (linear term, then powers 2 to 4)
generate norm_enem_w_priority_g = norm_enem_w_g * priority
forvalues p = 2/4 {
    generate norm_enem_w_priority_g`p' = norm_enem_w_g^`p' * priority
    summarize norm_enem_w_priority_g`p'
}

global g_norm_enem_w_prio norm_enem_w_priority_g*
describe $g_norm_enem_w_prio

* Interaction: subject X ENEM, with and without female, plus powers 2 to 4 of
* each interaction
foreach subj of varlist Biology-Portuguese {
    generate enem_`subj' = `subj' * norm_enem_w_g
    label variable enem_`subj' "ENEM $\times$ `subj'"

    generate fem_enem_`subj' = female * norm_enem_w_g * `subj'
    label variable fem_enem_`subj' "Female $\times$ ENEM $\times$ `subj'"

    forvalues p = 2/4 {
        generate enem_`subj'_`p' = enem_`subj'^`p'
        generate fem_enem_`subj'_`p' = fem_enem_`subj'^`p'
    }

    summarize enem_`subj'* fem_enem_`subj'*
}

* ENEM-by-subject polynomial terms for the five subjects kept in the regression
global g_pol_enem_sub "enem_Chemistry* enem_Geography* enem_History* enem_Mathematics* enem_Physics*"
describe $g_pol_enem_sub

******************************** Phase 1 scores ********************************

* Phase 1 score relative to the mean of the same year, subject and gender
tabulate year, summarize(norm_p1score)
bysort year subject female: egen gs_norm_p1score_ave = mean(norm_p1score)
generate gs_norm_p1score = norm_p1score - gs_norm_p1score_ave
bysort year female subject: summarize gs_norm_p1score
drop gs_norm_p1score_ave

* Powers 2 to 4 of the relative score
forvalues p = 2/4 {
    generate gs_norm_p1score`p' = gs_norm_p1score^`p'
    summarize gs_norm_p1score`p'
}

global gs_pol_norm_p1score gs_norm_p1score gs_norm_p1score2 gs_norm_p1score3 gs_norm_p1score4
describe $gs_pol_norm_p1score

* Priority x Phase 1 scores:
generate gs_norm_p1score_prio = gs_norm_p1score * priority
forvalues p = 2/4 {
    generate gs_norm_p1score_prio`p' = gs_norm_p1score`p' * priority
    summarize gs_norm_p1score_prio*
}

global gs_pol_norm_p1score_prio gs_norm_p1score_prio*
describe $gs_pol_norm_p1score_prio

*********************************************************************************
**************** Main sample ****************************************************
*********************************************************************************

* 1) Keep only the years before affirmative action took place, then also
*    remove exam year 2000
drop if aa_year==1
tabulate year
drop if year==2000
tabulate year

* 2) Remove Portuguese and Foreign Language: Phase 1 has no exam for either
*    subject (for Portuguese, Phase 1 has an essay instead)
tabulate subject, summarize(norm_p1score)
drop if inlist(subject, "lang", "port")
tabulate subject, summarize(norm_p1score)
drop Language Portuguese ///
    prio_Language prio_Portuguese ///
    fem_prio_Language fem_prio_Portuguese


********************************************************************************************************
****************  Step 1: In the first step we obtain initial estimates and store them  ****************
********************************************************************************************************

** # 1 - Main specification, without the female x priority interaction.
**       inscri2 fixed effects are absorbed and standard errors are clustered
**       on inscri2.
reghdfe norm_score priority $subject $subject_fem ///
    $g_pol_enem_sub $g_norm_enem_w_prio ///
    $gs_pol_norm_p1score $gs_pol_norm_p1score_prio, ///
    cluster(inscri2) absorb(inscri2) resid

** # 2 - Save residuals
predict residuals, resid

** # 3 - Create measure of relative priority performance
* Descriptive checks of the residuals by gender
ttest residuals, by(female)
ttest residuals if priority==1, by(female)
summarize residuals if female==1 & priority==1

* Mean residual over priority rows of each inscri2, copied to all of its rows
bysort inscri2: egen res_prio_ave = mean(residuals) if priority==1
bysort inscri2: egen res_prio_average = min(res_prio_ave)
mdesc res_prio_average
tabulate career_choice if  res_prio_average==.

* Same construction for the non-priority rows
bysort inscri2: egen res_nonprio_ave = mean(residuals) if priority==0
bysort inscri2: egen res_nonprio_average = min(res_nonprio_ave)
mdesc res_nonprio_average

* Relative priority performance: priority mean minus non-priority mean
generate res_diff = res_prio_average - res_nonprio_average
mdesc res_diff
ttest res_diff, by(female)

* Reduce the data to one row per inscri2, keeping the mean of each listed
* variable (the expected_duration variables are intentionally left out)
collapse (mean) res_diff female enem norm_enem_w ///
    gen_ques_st1 essay_st1 total_st1 career_choice year, ///
    by(inscri2)

count if res_diff==.

 ***************************************************
 ****************** Wages RAIS *********************
 ***************************************************

* Attach the RAIS wage records and discard rows found only in the RAIS file
count
merge 1:1 inscri2 using "Work Data/RAIS_cleaned.dta"
drop if _merge==2
tabulate _merge

** # 4 - Wage measures:

summarize mwagetot* decwagetot*

* For each horizon of 7 to 12 years after the exam year, pick the mwagetot
* variable whose suffix equals the corresponding calendar year
forvalues k = 7/12 {
    generate yearafter`k' = year + `k'
    tabulate yearafter`k', missing
    generate annual_wage_after`k' = .

    levelsof yearafter`k', local(calyears)
    foreach yr of local calyears {
        replace annual_wage_after`k' = mwagetot`yr' if yearafter`k'==`yr'
        summarize annual_wage_after`k'
    }
}

**** Average and maximum wage, 7-12 years after admission exam

* The six horizon-specific wage variables
local wage_window annual_wage_after7 annual_wage_after8 annual_wage_after9 annual_wage_after10 annual_wage_after11 annual_wage_after12

* Average
egen avg_annual_wage_712 = rowmean(`wage_window')
summarize avg_annual_wage*
summarize avg_annual_wage* if _merge==1

* Maximum
egen max_annual_wage_712 = rowmax(`wage_window')
summarize max_annual_wage*
summarize max_annual_wage* if _merge==1

* Log variables, prefixed with l_
foreach w of varlist annual_wage_after*  max* avg*  {
    generate l_`w' = log(`w')
    summarize l_`w'
}

* Standardize res_diff with its sample mean and standard deviation
summarize res_diff
generate mean_res_diff = r(mean)
generate sd_res_diff = r(sd)
generate norm_res_diff = (res_diff - mean_res_diff) / sd_res_diff
summarize norm_res_diff

********************************************************************************************************
********************************  Step 2: Decomposition wage regressions  ******************************
********************************************************************************************************

label variable norm_enem_w "Norm. ENEM scores"
label variable norm_res_diff "Relative priority performance"

* Indicator sets for exam year and for major choice
tabulate year, generate(dyear)
tabulate career_choice, generate(dcareer)

* Four decompositions per log-wage outcome, stored as <outcome>bm1 to <outcome>bm4.
* In every call weight(1) is used: per the original author's note, b in the
* interaction is the male coefficient (the version we want).
foreach dep of varlist l_*_annual_wage_712 {

    **** Without major choice FE

    * bm1: year FE
    xi: oaxaca `dep' norm_res_diff (year:dyear*), by(female) weight(1) categorical(dyear*)
    estimates store `dep'bm1
    estadd ysumm

    * bm2: year FE, include ENEM
    xi: oaxaca `dep' norm_res_diff norm_enem_w  (year:dyear*),  by(female) weight(1) categorical(dyear*)
    estimates store `dep'bm2
    estadd ysumm

    **** Control for major choice FE

    * bm3: year FE
    xi: oaxaca `dep' norm_res_diff (year:dyear*) (career:dcareer*), by(female) weight(1) categorical(dyear*,dcareer*)
    estimates store `dep'bm3
    estadd ysumm

    * bm4: year FE, include ENEM
    xi: oaxaca `dep' norm_res_diff norm_enem_w (year:dyear*) (career:dcareer*), by(female) weight(1) categorical(dyear*,dcareer*)
    estimates store `dep'bm4
    estadd ysumm

    * Flags attached to the stored results for use in the table footer
    estadd local program "Yes": `dep'*3  `dep'*4
    estadd local program "No": `dep'*1  `dep'*2
    estadd local year "Yes": `dep'*

}



* Interaction male coefficients
* Export the stored decompositions to LaTeX: the four average-wage models
* first, then the four maximum-wage models, matching the two column groups.
* The output path uses forward slashes, like the input paths in this file, so
* that it is read as a directory separator on every operating system.
esttab l_avg_annual_wage_712bm* l_max_annual_wage_712bm* ///
    using "Output/Oaxaca_Wages_RAIS_CoefMen.tex", ///
    label drop(year _cons) se star(* 0.10 ** 0.05 *** 0.01) nomtitle f ///
    stats(N ymean year , fmt( %9.0fc %7.3f %3s) labels("Number of observations"  "Mean dependent variable" "Exam year FE")) ///
    b(%7.3f) se(%7.3f) replace  collabels(none) nogaps booktabs ///
    mgroups("Average" "Maximum", pattern(1 0 0 0 1 0 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
    rename(group_1 "Men" group_2 "Women" career "Major FE" explained "Explained" unexplained "Unexplained" overall "Overall")





